{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from bs4 import BeautifulSoup\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer\n",
    "import lightgbm as lgb\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "stemmer = WordNetLemmatizer()\n",
    "from scipy import sparse\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import StrMethodFormatter\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt\n",
    "from gensim.models import Word2Vec\n",
    "import nltk\n",
    "tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\n",
    "import sys\n",
    "import gensim\n",
    "from gensim.models.doc2vec import LabeledSentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Train02.csv', encoding='ISO-8859-1')\n",
    "test_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Test02.csv', encoding='ISO-8859-1')\n",
    "FeatureNames = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\FeatureNames02.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "test_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]\n",
    "test_other_models = test_other_models[['id','Price_Log_Pred']]\n",
    "\n",
    "train = pd.merge(train_orig, train_other_models, on='id')\n",
    "test = pd.merge(test_orig, test_other_models, on='id')\n",
    "\n",
    "train['Price_Log'] = np.log10(train['Price']+1)\n",
    "train.hist(column='Price_Log')\n",
    "\n",
    "FeatureNames = FeatureNames['x'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_text(review, remove_stopwords=False, Lem=False):\n",
    "    review_text = BeautifulSoup(review, \"html.parser\").get_text()\n",
    "    review_text = re.sub('[^a-zA-Z]',' ', review_text)\n",
    "    review_text = re.sub('\\s+',' ', review_text)\n",
    "    words = review_text.lower().split()\n",
    "    if remove_stopwords:\n",
    "        stops = set(stopwords.words(\"english\"))\n",
    "        words = [w for w in words if not w in stops]\n",
    "    if Lem:\n",
    "        words = [stemmer.lemmatize(w) for w in words] # Lemmatization\n",
    "    review_text = (' '.join([word for word in words]))\n",
    "    return(review_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['Synopsis2'] = train['Synopsis'].apply(lambda x: cleaning_text(x,True,True))\n",
    "test['Synopsis2'] = test['Synopsis'].apply(lambda x: cleaning_text(x,True,True))\n",
    "\n",
    "train['Title2'] = train['Title'].apply(lambda x: cleaning_text(x,True,True))\n",
    "test['Title2'] = test['Title'].apply(lambda x: cleaning_text(x,True,True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def labelize_text(text,label):\n",
    "    result = []\n",
    "    prefix = label\n",
    "    for i, t in zip(text.index, text):\n",
    "        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_x1 = pd.concat([train.Synopsis2,test.Synopsis2])\n",
    "all_x2 = pd.concat([train.Title2,test.Title2])\n",
    "\n",
    "all_x1_w2v = labelize_text(all_x1, 'ALL')\n",
    "all_x2_w2v = labelize_text(all_x2, 'ALL')\n",
    "\n",
    "x_train1 = labelize_text(train.Synopsis2, 'TRAIN')\n",
    "x_validation1 = labelize_text(test.Synopsis2, 'TEST')\n",
    "\n",
    "x_train2 = labelize_text(train.Title2, 'TRAIN')\n",
    "x_validation2 = labelize_text(test.Title2, 'TEST')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models.word2vec import Word2Vec\n",
    "from tqdm import tqdm\n",
    "from sklearn import utils\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from gensim.models import Doc2Vec\n",
    "import multiprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time\n",
    "cores = multiprocessing.cpu_count()\n",
    "\n",
    "model_dbow1 = Doc2Vec(dm=0, size=300, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dbow1.build_vocab([x for x in tqdm(all_x1_w2v)])\n",
    "model_dbow1.train(utils.shuffle([x for x in tqdm(all_x1_w2v)]), total_examples=len(all_x1_w2v), epochs=1)\n",
    "\n",
    "model_dbow2 = Doc2Vec(dm=0, size=300, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dbow2.build_vocab([x for x in tqdm(all_x2_w2v)])\n",
    "model_dbow2.train(utils.shuffle([x for x in tqdm(all_x2_w2v)]), total_examples=len(all_x2_w2v), epochs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time\n",
    "vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=1)\n",
    "matrix1 = vectorizer1.fit_transform([x.words for x in all_x1_w2v])\n",
    "tfidf1 = dict(zip(vectorizer1.get_feature_names(), vectorizer1.idf_))\n",
    "\n",
    "vectorizer2 = TfidfVectorizer(analyzer=lambda x: x, min_df=1)\n",
    "matrix2 = vectorizer2.fit_transform([x.words for x in all_x2_w2v])\n",
    "tfidf2 = dict(zip(vectorizer2.get_feature_names(), vectorizer2.idf_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_doc_Vector1(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dbow1[word].reshape((1, size)) * tfidf1[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec\n",
    "\n",
    "def build_doc_Vector2(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dbow2[word].reshape((1, size)) * tfidf2[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import scale\n",
    "\n",
    "vecs1_trn_dbow = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])\n",
    "vecs1_trn_dbow = scale(vecs1_trn_dbow)\n",
    "vecs1_tst_dbow = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])\n",
    "vecs1_tst_dbow = scale(vecs1_tst_dbow)\n",
    "\n",
    "vecs2_trn_dbow = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])\n",
    "vecs2_trn_dbow = scale(vecs2_trn_dbow)\n",
    "vecs2_tst_dbow = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])\n",
    "vecs2_tst_dbow = scale(vecs2_tst_dbow)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vecs1_trn_dbow = pd.DataFrame(vecs1_trn_dbow,columns=['Synopsis_dbow_'+str(x) for x in range(1,300+1)])\n",
    "vecs1_tst_dbow = pd.DataFrame(vecs1_tst_dbow,columns=['Synopsis_dbow_'+str(x) for x in range(1,300+1)])\n",
    "\n",
    "vecs2_trn_dbow = pd.DataFrame(vecs2_trn_dbow,columns=['Title_dbow_'+str(x) for x in range(1,300+1)])\n",
    "vecs2_tst_dbow = pd.DataFrame(vecs2_tst_dbow,columns=['Title_dbow_'+str(x) for x in range(1,300+1)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time\n",
    "cores = multiprocessing.cpu_count()\n",
    "\n",
    "model_dmc1 = Doc2Vec(dm=1, dm_concat=1, size=300, window=2, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dmc1.build_vocab([x for x in tqdm(all_x1_w2v)])\n",
    "model_dmc1.train(utils.shuffle([x for x in tqdm(all_x1_w2v)]), total_examples=len(all_x1_w2v), epochs=1)\n",
    "\n",
    "model_dmc2 = Doc2Vec(dm=1, dm_concat=1, size=300, window=2, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dmc2.build_vocab([x for x in tqdm(all_x2_w2v)])\n",
    "model_dmc2.train(utils.shuffle([x for x in tqdm(all_x2_w2v)]), total_examples=len(all_x2_w2v), epochs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_doc_Vector1(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dmc1[word].reshape((1, size)) * tfidf1[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec\n",
    "\n",
    "def build_doc_Vector2(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dmc2[word].reshape((1, size)) * tfidf2[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vecs1_trn_dmc = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])\n",
    "vecs1_trn_dmc = scale(vecs1_trn_dmc)\n",
    "vecs1_tst_dmc = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])\n",
    "vecs1_tst_dmc = scale(vecs1_tst_dmc)\n",
    "\n",
    "vecs2_trn_dmc = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])\n",
    "vecs2_trn_dmc = scale(vecs2_trn_dmc)\n",
    "vecs2_tst_dmc = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])\n",
    "vecs2_tst_dmc = scale(vecs2_tst_dmc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vecs1_trn_dmc = pd.DataFrame(vecs1_trn_dmc,columns=['Synopsis_dmc_'+str(x) for x in range(1,300+1)])\n",
    "vecs1_tst_dmc = pd.DataFrame(vecs1_tst_dmc,columns=['Synopsis_dmc_'+str(x) for x in range(1,300+1)])\n",
    "\n",
    "vecs2_trn_dmc = pd.DataFrame(vecs2_trn_dmc,columns=['Title_dmc_'+str(x) for x in range(1,300+1)])\n",
    "vecs2_tst_dmc = pd.DataFrame(vecs2_tst_dmc,columns=['Title_dmc_'+str(x) for x in range(1,300+1)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cores = multiprocessing.cpu_count()\n",
    "\n",
    "model_dmm1 = Doc2Vec(dm=1, dm_mean=1, size=300, window=4, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dmm1.build_vocab([x for x in tqdm(all_x1_w2v)])\n",
    "model_dmm1.train(utils.shuffle([x for x in tqdm(all_x1_w2v)]), total_examples=len(all_x1_w2v), epochs=1)\n",
    "\n",
    "model_dmm2 = Doc2Vec(dm=1, dm_mean=1, size=300, window=4, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)\n",
    "model_dmm2.build_vocab([x for x in tqdm(all_x2_w2v)])\n",
    "model_dmm2.train(utils.shuffle([x for x in tqdm(all_x2_w2v)]), total_examples=len(all_x2_w2v), epochs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_doc_Vector1(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dmm1[word].reshape((1, size)) * tfidf1[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec\n",
    "\n",
    "def build_doc_Vector2(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_dmm2[word].reshape((1, size)) * tfidf2[word]\n",
    "            count += 1.\n",
    "        except KeyError: \n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "    return vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vecs1_trn_dmm = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])\n",
    "vecs1_trn_dmm = scale(vecs1_trn_dmm)\n",
    "vecs1_tst_dmm = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])\n",
    "vecs1_tst_dmm = scale(vecs1_tst_dmm)\n",
    "\n",
    "vecs2_trn_dmm = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])\n",
    "vecs2_trn_dmm = scale(vecs2_trn_dmm)\n",
    "vecs2_tst_dmm = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])\n",
    "vecs2_tst_dmm = scale(vecs2_tst_dmm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vecs1_trn_dmm = pd.DataFrame(vecs1_trn_dmm,columns=['Synopsis_dmm_'+str(x) for x in range(1,300+1)])\n",
    "vecs1_tst_dmm = pd.DataFrame(vecs1_tst_dmm,columns=['Synopsis_dmm_'+str(x) for x in range(1,300+1)])\n",
    "\n",
    "vecs2_trn_dmm = pd.DataFrame(vecs2_trn_dmm,columns=['Title_dmm_'+str(x) for x in range(1,300+1)])\n",
    "vecs2_tst_dmm = pd.DataFrame(vecs2_tst_dmm,columns=['Title_dmm_'+str(x) for x in range(1,300+1)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.concat([train,vecs1_trn_dbow,vecs2_trn_dbow,vecs1_trn_dmc,vecs2_trn_dmc,vecs1_trn_dmm,vecs2_trn_dmm], axis=1)\n",
    "test = pd.concat([test,vecs1_tst_dbow,vecs2_tst_dbow,vecs1_tst_dmc,vecs2_tst_dmc,vecs1_tst_dmm,vecs2_tst_dmm], axis=1)\n",
    "print(\"Train Shape : \",train.shape)\n",
    "print(\"Test Shape : \",test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FeatureNames2 = list(vecs1_trn_dbow.columns) + list(vecs2_trn_dbow.columns) + list(vecs1_trn_dmc.columns) + list(vecs2_trn_dmc.columns) + list(vecs1_trn_dmm.columns) + list(vecs2_trn_dmm.columns)\n",
    "FeatureNames2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_list = list(train.FOLD_NUM.unique())\n",
    "fold_list.sort()\n",
    "fold_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])\n",
    "    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])\n",
    "    \n",
    "    param = {\n",
    "    'learning_rate': 0.01,\n",
    "    'max_depth': 14,\n",
    "    'min_data_in_leaf': 3,\n",
    "    'bagging_freq': 0,\n",
    "    'bagging_fraction': 0.95,\n",
    "    'feature_fraction': 0.2,\n",
    "    'boost': 'gbdt',    \n",
    "    'objective': 'regression',\n",
    "    'metric': 'rmse',\n",
    "    'seed': 392\n",
    "    }\n",
    "    \n",
    "    model = lgb.train(param, trn_data, 2000000,valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)\n",
    "    \n",
    "    print(\"Fold RMSLE = \",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = model.predict(test[FeatureNames+FeatureNames2].values, num_iteration=model.best_iteration)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + model.predict(test[FeatureNames+FeatureNames2].values, num_iteration=model.best_iteration)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = train[['id','FOLD_NUM','Price','Price_Log']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB']], on='id')\n",
    "print(training_cv_predictions.head())\n",
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LGB'] = sub_data_preds\n",
    "test['Price_Log_Pred_LGB'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chk1 = pd.isnull(train[FeatureNames+FeatureNames2]).sum()\n",
    "chk1[chk1 > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chk1 = pd.isnull(test[FeatureNames+FeatureNames2]).sum()\n",
    "chk1[chk1 > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Ridge\n",
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    ridgereg = Ridge(alpha=0.45,normalize=True)\n",
    "    ridgereg.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_RIDGE'] = ridgereg.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = ridgereg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + ridgereg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "\n",
    "print(\"RIDGE 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_RIDGE'] = sub_data_preds\n",
    "test['Price_Log_Pred_RIDGE'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Lasso\n",
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    lassoreg = Lasso(alpha=0.00001,normalize=True, max_iter=1e6)\n",
    "    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = lassoreg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + lassoreg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "    \n",
    "print(\"LASSO 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LASSO']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LASSO'] = sub_data_preds\n",
    "test['Price_Log_Pred_LASSO'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    KNN = KNeighborsRegressor(n_neighbors = 10)\n",
    "    KNN.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_KNN'] = KNN.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_KNN'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = KNN.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + KNN.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "\n",
    "print(\"KNN 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_KNN']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_KNN'] = sub_data_preds\n",
    "test['Price_Log_Pred_KNN'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    RF = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 3, max_features = 30, random_state = 412,\n",
    "                               verbose = 0, max_depth = 40)\n",
    "    RF.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_RF'] = RF.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RF'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = RF.predict(test[FeatureNames+FeatureNames2])\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + RF.predict(test[FeatureNames+FeatureNames2])\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"RF 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RF']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_RF'] = sub_data_preds\n",
    "test['Price_Log_Pred_RF'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20191023_Stack21_DS.csv\", index=False)\n",
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20191023_Stack21_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names_ensemble = list(training_cv_predictions.columns[4:])\n",
    "feature_names_ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "corr = training_cv_predictions[feature_names_ensemble + ['Price_Log']].corr()\n",
    "ax = sns.heatmap(\n",
    "    corr, \n",
    "    vmin=0, vmax=1, center=0.5,\n",
    "    cmap=sns.diverging_palette(20, 220, n=200),\n",
    "    square=True\n",
    ")\n",
    "ax.set_xticklabels(\n",
    "    ax.get_xticklabels(),\n",
    "    rotation=45,\n",
    "    horizontalalignment='right'\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]\n",
    "    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    ridgereg_e = Ridge(alpha=0.00001,normalize=True)\n",
    "    ridgereg_e.fit(temp_train[feature_names_ensemble],temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LGB_ENS'] = ridgereg_e.predict(temp_val[feature_names_ensemble])\n",
    "    \n",
    "    print(\"Fold RMSLE = \",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = ridgereg_e.predict(test[feature_names_ensemble])\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + ridgereg_e.predict(test[feature_names_ensemble])\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"LGB ENS CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB_ENS']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LGB_ENS'] = sub_data_preds\n",
    "test['Price_Log_Pred_LGB_ENS'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20191023_Stack21_DS.csv\", index=False)\n",
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20191023_Stack21_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')\n",
    "\n",
    "sub_data_preds2 = (10**test['Price_Log_Pred_LGB_ENS'].values) - 1\n",
    "print(pd.DataFrame(sub_data_preds2).describe())\n",
    "\n",
    "submission['Price'] = sub_data_preds2\n",
    "submission.to_excel('C:\\\\Kaggle\\\\BooksPrice\\\\Submissions\\\\20191023_Stack21_DS.xlsx', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
